/*******************************************************************************
  ARTICLE	GAY, GOBBI, GONI (2025) "REVOLUTIONARY TRANSITIONS. INHERITANCE    
            CHANGE AND FERTILITY DECLINE" JOURNAL OF POLITICAL ECONOMY         
                                                                               
  AUTHORS	VICTOR GAY, PAULA GOBBI, MARC GONI                                 
  CONTACT	victor.gay@tse-fr.eu; paula.eugenia.gobbi@ulb.be; marc.goni@uib.no 
  VERSION	1.0 (MAY 2025)                                                     
  SOFTWARE	STATA SE 18                                                        
  LICENCE	MIT                                                                
--------------------------------------------------------------------------------

FAMILINX DATA PREPARATION DO FILE (for Appendix Figure B2)

This file combines the Familinx data files and generates the familinx annual fertility series used in Appendix Figure B2.

Open do-files from directory where they are placed; order matters; run whole code.
	
Do-file structure: 
------------------
	0. Program setup
	0. Info on accessing Familinx data
	1. Import raw files
	2. Create fertility variable and merge profiles-anon with relations-anon
	3. Multiple-generation links for H-sample
	4. Variables
	5. Add H-sample filter
	6. Data for Appendix Figure B2

Main sources: 
-------------
	Familinx (https://familinx.org)	 		 
*/
********************************************************************************

*------------------------------------------------------------------------------*
* 0. PROGRAM SETUP
*------------------------------------------------------------------------------*

* Run under Stata 18 semantics, never pause output, and start from an empty session.
version 18
set more off
clear all

*------------------------------------------------------------------------------*
* PACKAGE DEPENDENCIES
*------------------------------------------------------------------------------*
* None are required by the active code. The commented-out profiles-anon import
* step calls the command fs, which is not defined in this file
* (presumably a user-written package; confirm before re-enabling that step).

*------------------------------------------------------------------------------*
* DIRECTORIES
*------------------------------------------------------------------------------*

* Folder for intermediate .dta files, relative to the directory the do-file is run from.
global TEMP "../2_0_tempfiles"

* Timer 1 measures the run; it is stopped and listed on the last lines of the file.
timer on 1

* ==============================================================================
* 0. Get access to the Familinx data and place the files "relations-anon.txt" and "profiles-anon.txt" in folder /1_raw_data/1_19_familinx. The Familinx data is a sub-sample of the Geni database scraped by Kaplanis et al. (2018). At the time our project started, it could be downloaded from https://familinx.org (accessed in February 2017).
* ==============================================================================

* ==============================================================================
* 1. Import raw files to Stata
* ==============================================================================

* profiles-anon
* -------------
* Note: First use "Text File Splitter" (https://apps.apple.com/us/app/text-file-splitter/id831715956?mt=12&ign-mpt=uo%3D5) to break /1_raw_data/1_19_familinx/profiles-anon.txt into 151 parts, named part1, part2, ..., part151, respectively. Place these files in folder 2_scripts/2_0_tempfiles
/*
* Save part1 to part151 in stata format
foreach x of numlist 1/151 {
import delimited "$TEMP/part`x'.txt"
gen first_two_digits_birth = substr(birth_location_country, 1, 2)  // to drop obs outside france/belgium
gen first_two_digits_baptism = substr(baptism_location_country, 1, 2) 
keep if first_two_digits_birth=="Be" |first_two_digits_birth=="BE"|first_two_digits_birth=="FR"|first_two_digits_birth=="Fr" |first_two_digits_birth=="fr" |first_two_digits_birth=="be" | first_two_digits_baptism=="Be" |first_two_digits_baptism=="BE"|first_two_digits_baptism=="FR"|first_two_digits_baptism=="Fr" |first_two_digits_baptism=="fr" |first_two_digits_baptism=="be" 
drop if birth_location_country=="Belarus" | birth_location_country=="Bessarabien" | birth_location_country=="Bermuda"| birth_location_country=="Bessarabia"
keep profileid-cause_of_death
save "$TEMP/part`x'.dta"
clear
}

* Append them together
cd "$TEMP"
clear
fs *.dta
append using `r(files)', force

* Drop some misplaced variables (after v75)
drop cause_of_death9109-cause_of_deathle

* Change * to missing
ds profileid-cause_of_death, has(type string)
    foreach var of varlist `r(varlist)' {
        replace `var' = "." if strpos(`var',"*")
        destring `var', replace
 }
 
rename v7 current_res_loc_country
rename v8 current_res_loc_country_code
rename v14 current_res_res_ext_type
rename v59 burial_loc_res_ext_type
rename v73 baptism_loc_res_ext_confidence
rename v74 baptism_loc_res_ext_type

save "geni-frbe.dta", replace
cd "../2_1_data"
*/
* relations-anon
* --------------
* Read the raw link file (one row per parent-child pair) and keep a Stata copy
* in $TEMP; every later section starts from this copy.
import delimited using "../../1_raw_data/1_19_familinx/relations-anon.txt", clear
save "$TEMP/geni_links.dta", replace


* ==============================================================================
* 2. Create fertility variable and merge profiles-anon with relations-anon
* ==============================================================================

* Create fertility variable
* -------------------------
* Fertility of a profile = number of rows of the link file in which it appears
* as parent. The file is loaded with "use ..., clear" rather than "clear all":
* "clear all" also resets every timer to zero, which would wipe timer 1 started
* in the setup. Output paging is left off (as set in the setup) so the run
* cannot stall waiting for a key press.
use "$TEMP/geni_links.dta", clear
bysort parent: gen fertility = _N
collapse (mean) fertility, by(parent)
rename parent profileid
tempfile fertility
save `fertility', replace

* Clean profiles-anon data
* ------------------------
use "$TEMP/geni-frbe.dta", clear

* One spelling of Bruxelles is recorded in Cyrillic script; recode it.
replace birth_location_city="Bruxelles" if birth_location_city=="Брюксел"

* Harmonize "birth_location_country" to one name for France and one for Belgium,
* using the first two characters of the recorded country string.
* "." is used as the string placeholder for "not classified".
gen first_two_digits_birth = substr(birth_location_country, 1, 2)
gen countryofbirth = "."
replace countryofbirth = "Belgium" if inlist(first_two_digits_birth, "Be", "BE", "be")
replace countryofbirth = "France"  if inlist(first_two_digits_birth, "FR", "Fr", "fr")

* Same classification for "baptism_location_country". The baptism country is
* used only when the birth country could not be classified.
* NOTE(review): a two-character prefix "Be" would also match other countries
* starting with "Be" in the baptism field; confirm none are present.
gen countryofbaptism = "."
gen first_two_digits_baptism = substr(baptism_location_country, 1, 2)
replace countryofbaptism = "Belgium" if inlist(first_two_digits_baptism, "Be", "BE", "be")
replace countryofbaptism = "France"  if inlist(first_two_digits_baptism, "FR", "Fr", "fr")
gen country_birth_or_baptism = countryofbirth
replace country_birth_or_baptism = countryofbaptism if countryofbirth=="."
encode country_birth_or_baptism, generate(country_birth_baptism)
drop country_birth_or_baptism

* Keep only people born between 1500-1910.
* Three five-digit birth years are recoded to four-digit years first.
gen birth_year_n = birth_year
replace birth_year_n = 1714 if birth_year==11714
replace birth_year_n = 1901 if birth_year==19010
replace birth_year_n = 1766 if birth_year==31766

* Add info in birth_date_text: the year is always the last numbers.
* destring, force turns any non-numeric last-four-character string into missing.
gen textyear = substr(birth_date_text,-4,4)
destring textyear, replace force
replace birth_year_n = textyear if birth_year==.
* Missing birth years compare as larger than 1910, so they are dropped as well.
drop if birth_year_n<1500 | birth_year_n>1910
drop textyear

* Merge
* NOTE(review): the merged data is left in memory and never saved; the next
* section starts by loading the link file again, so nothing computed in this
* section reaches the exported series.
merge 1:1 profileid using `fertility'

* ==============================================================================
* 3. Multiple-generation links for H-sample
* ==============================================================================

* Fertility to merge with different generations
* ---------------------------------------------
* Number of link rows per parent, saved twice under the two key/value names
* used for the first and the second recorded parent (id_g1_1/fert_g1_1 and
* id_g1_2/fert_g1_2).
* The file is loaded with "use ..., clear" rather than "clear all" because
* "clear all" resets timer 1 started in the setup; paging is left off.
use "$TEMP/geni_links.dta", clear
bysort parent: gen fertility = _N
collapse (mean) fertility, by(parent)

rename (parent fertility) (id_g1_1 fert_g1_1)
tempfile fertg1_1
save `fertg1_1', replace

rename (id_g1_1 fert_g1_1) (id_g1_2 fert_g1_2)
tempfile fertg1_2
save `fertg1_2', replace


* ID FOR PARENTS
* ---------------
* Build one row per child (profileid) with its recorded parents side by side:
* parents are numbered within child in ascending id order and spread to
* id_g1_1, id_g1_2, ...
use "$TEMP/geni_links.dta", clear
rename (parent child) (id_g1_ profileid)
bysort profileid (id_g1_): gen j = _n
reshape wide id_g1_, i(profileid) j(j)

* Attach each parent's fertility; rows without a match get fertility 0.
foreach side of numlist 1 2 {
    merge m:1 id_g1_`side' using `fertg1_`side'', keep(master match) nogenerate
    replace fert_g1_`side' = 0 if fert_g1_`side' == .
}

* Generation-1 table; the following steps keep extending this file.
tempfile gen0
save `gen0', replace


* ID FOR GRANDPARENTS
* -------------------
* Each pass relabels the child-parent table in memory so that its profileid
* plays the role of a generation-1 id and its own parents become generation-2
* ids, then joins it onto the accumulated file on that generation-1 id.
* keep(using match) retains every row of the accumulated file.

* SIDE 1: parents of id_g1_1 become id_g2_1 and id_g2_2
rename (id_g1_1 fert_g1_1 id_g1_2 fert_g1_2 profileid) ///
       (id_g2_1 fert_g2_1 id_g2_2 fert_g2_2 id_g1_1)
merge 1:m id_g1_1 using `gen0', keep(using match) nogenerate
tempfile gen0
save `gen0', replace

* SIDE 2: parents of id_g1_2 become id_g2_3 and id_g2_4
* (first go back to the plain child-parent table)
keep profileid id_g1_1 id_g1_2 fert_g1_1 fert_g1_2
rename (id_g1_1 fert_g1_1 id_g1_2 fert_g1_2 profileid) ///
       (id_g2_3 fert_g2_3 id_g2_4 fert_g2_4 id_g1_2)
merge 1:m id_g1_2 using `gen0', keep(using match) nogenerate
tempfile gen0
save `gen0', replace


* ID FOR GREAT-GRANDPARENTS
* -------------------------
* SIDES 1-4
* For grandparent slot s (id_g2_s), the two parents of that grandparent are
* stored in slots 2s-1 and 2s of generation 3. Every pass:
*   1. reduces the data in memory to the plain child-parent table,
*   2. relabels it so profileid becomes id_g2_s and its parents id_g3_*,
*   3. joins it onto the accumulated file, keeping all accumulated rows,
*   4. saves the result as the new accumulated file.
forvalues side = 1/4 {
    local first  = 2*`side' - 1
    local second = 2*`side'
    keep profileid id_g1_1 id_g1_2 fert_g1_1 fert_g1_2
    * great-grandparents id number
    rename (id_g1_1 fert_g1_1 id_g1_2 fert_g1_2 profileid) ///
           (id_g3_`first' fert_g3_`first' id_g3_`second' fert_g3_`second' id_g2_`side')
    merge 1:m id_g2_`side' using `gen0', keep(using match) nogenerate
    * save
    tempfile gen0
    save `gen0', replace
}


* ID FOR GREAT-GREAT-GRANDPARENTS
* -------------------------------
* SIDES 1-8
* Same relabel-and-join pass as for the earlier generations: the parents of
* great-grandparent slot s (id_g3_s) go to generation-4 slots 2s-1 and 2s.
forvalues side = 1/8 {
    local odd  = 2*`side' - 1
    local even = 2*`side'
    * back to the plain child-parent table
    keep profileid id_g1_1 id_g1_2 fert_g1_1 fert_g1_2
    * great-great-grandparents id number
    rename (id_g1_1 fert_g1_1 id_g1_2 fert_g1_2 profileid) ///
           (id_g4_`odd' fert_g4_`odd' id_g4_`even' fert_g4_`even' id_g3_`side')
    * keep every row of the accumulated file, matched or not
    merge 1:m id_g3_`side' using `gen0', keep(using match) nogenerate
    * save
    tempfile gen0
    save `gen0', replace
}

* Column order: profileid, then all ancestor ids by generation (2, 4, 8 and 16
* slots for generations 1 to 4), then all ancestor fertilities in the same order.
local idvars
local fertvars
forvalues g = 1/4 {
    local nslots = 2^`g'
    forvalues s = 1/`nslots' {
        local idvars   `idvars' id_g`g'_`s'
        local fertvars `fertvars' fert_g`g'_`s'
    }
}
order profileid `idvars' `fertvars'

* Four-generation link file, merged back in the H-sample filter section.
save "$TEMP/geni-links-4gen.dta", replace


* ==============================================================================
* 4. Variables
* ==============================================================================

* COMPLETED FERTILITY
* -------------------
* Link-level file (one row per parent-child pair) carrying the parent's number
* of link rows as fert; the parent id is renamed profileid for the merge below.
* Loaded with "use ..., clear" rather than "clear all" because "clear all"
* resets timer 1 started in the setup; paging is left off as set there.
use "$TEMP/geni_links.dta", clear
format %20.0f parent child
bysort parent: gen fert = _N
rename parent profileid
tempfile FERT0
save `FERT0', replace

* GENDER, BIRTH YEAR, COUNTRY
* ---------------------------
use "$TEMP/geni-frbe.dta", clear

* Country of birth, baptism, death (from coded data):
* fra = 1 when any of the three country codes is "FR".
gen bfra  = (birth_location_country_code=="FR")
gen bpfra = (baptism_location_country_code=="FR")
gen dfra  = (death_location_country_code=="FR")
gen fra   = (bfra==1 | bpfra==1 | dfra==1)

* Birth year: use birth_year, falling back on the last four characters of
* birth_date_text when birth_year is missing (non-numeric text becomes missing).
gen byear = birth_year
gen textyear = substr(birth_date_text,-4,4)
destring textyear, replace force
replace byear = textyear if birth_year==.
* Recode three five-digit birth years to four-digit years.
replace byear = 1714 if byear==11714
replace byear = 1901 if byear==19010
replace byear = 1766 if byear==31766
drop textyear

* Housekeeping
keep profileid gender byear birth_location_latitude birth_location_longitude bfra bpfra dfra fra
rename birth_location_latitude lat
rename birth_location_longitude lon

* Merge: keep only profiles that appear as a parent in the link file. The
* result is again one row per parent-child pair, now with parent attributes.
merge 1:m profileid using `FERT0', keep(match) nogenerate
tempfile FERT0
save `FERT0', replace


* CHILD's GENDER, BIRTH YEAR, DEATH YEAR, COUNTRY
* ------------------------------------------------------------------------------
* Same attributes as for the parent, built from the profile file and attached
* to the link-level file through the child id, with a ch_ prefix.
use "$TEMP/geni-frbe.dta", clear

* France indicators from the coded birth, baptism and death countries
gen bfra  = (birth_location_country_code == "FR")
gen bpfra = (baptism_location_country_code == "FR")
gen dfra  = (death_location_country_code == "FR")
gen fra   = (bfra == 1 | bpfra == 1 | dfra == 1)

* Birth year: recorded year, else the last four characters of birth_date_text
* (non-numeric text becomes missing); three five-digit years are then recoded.
gen textyear = substr(birth_date_text, -4, 4)
destring textyear, replace force
gen byear = cond(birth_year == ., textyear, birth_year)
recode byear (11714 = 1714) (19010 = 1901) (31766 = 1766)
drop textyear

* housekeeping: keep the needed columns and give them child-specific names
keep profileid gender byear birth_location_latitude birth_location_longitude bfra bpfra dfra fra death_year
rename (profileid gender    byear    death_year bfra    bpfra    dfra    fra   ) ///
       (child     ch_gender ch_byear ch_dyear   ch_bfra ch_bpfra ch_dfra ch_fra)
rename (birth_location_latitude birth_location_longitude) (ch_lat ch_lon)

* merge: every parent-child row is kept, whether or not the child has a profile
merge 1:m child using `FERT0', keep(using match) nogenerate
order profileid child fert gender lat lon bfra bpfra dfra fra byear

* other variables
* year: smallest non-missing child birth year of the parent (missing sorts last)
bysort profileid (ch_byear): gen year = ch_byear[1]
* ch_france: ch_fra of the first row after a descending sort, in which missing
* values are placed last
gsort profileid -ch_fra
by profileid: gen ch_france = ch_fra[1]


* ==============================================================================
* 5. Add H-sample filter
* ------------------------------------------------------------------------------
/* 	From Blanc (2020a): "By definition, only vertical lineages are complete in 
genealogies and horizontal lineages can be missing. In order to deal with this 
issue, I define the fertility sample, the sample with a recorded horizontal
lineage, by retaining only observations for which at least one parent in any of 
the four generations preceding an individual's observation is recorded as having 
a fertility rate that is strictly greater than one."	
*/
* Attach the ancestors of each parent (profileid). All rows in memory are kept;
* the _merge variable created here stays in the data.
merge m:1 profileid using "$TEMP/geni-links-4gen.dta", keep(master match)

* fsample = 1 when any ancestor slot in generations 1 to 4 (2, 4, 8 and 16
* slots respectively) has a non-missing fertility above one.
gen fsample = 0
forvalues g = 1/4 {
    local nslots = 2^`g'
    forvalues s = 1/`nslots' {
        replace fsample = 1 if fert_g`g'_`s' != . & fert_g`g'_`s' > 1
    }
}


* ==============================================================================
* 6. Data for Appendix Figure B2 (the figure named in the file header)
* ==============================================================================

* sample
* Keep parent-child rows where the parent or at least one of the parent's
* children is coded to France, the parent is in the H-sample, and the birth
* year of the parent's earliest child is known.
keep if fra==1 | ch_france==1
keep if fsample==1
keep if year!=.

* Child mortality
* A death year earlier than the birth year is treated as unknown.
replace ch_dyear = . if ch_dyear<ch_byear
* 1 when death year minus birth year is below 6; evaluates to 0 when either
* year is missing, because a missing difference is not below 6.
gen ch_mort = (ch_dyear-ch_byear<6)
* Number of such child deaths per parent. The original computed a running sum
* mort0 and then took "max(mort)", which only ran because mort was resolved as
* an abbreviation of mort0; the group total is the same number without that
* dependence on variable abbreviation.
bysort profileid: egen mort = total(ch_mort)

* Fertility measures, defined for women only (missing for everyone else)
gen fert_net = fert-mort if gender=="female"
gen fert_wom = fert if gender=="female"

* collapse by parent (means; values are constant within parent)
collapse (mean) fert_net fert_wom year byear, by(profileid)
* collapse by year: mean across parents sharing the same own birth year (byear)
collapse (mean) fert_net fert_wom, by(byear)

export delimited using "../../3_outputs/3_1_datasets/familinx-year-series.csv", replace

* Remove the intermediate files written by this do-file. geni-frbe.dta is kept:
* the step that builds it is commented out above, so it cannot be regenerated
* by a plain re-run.
erase "$TEMP/geni-links-4gen.dta"
erase "$TEMP/geni_links.dta"
*erase "geni-frbe.dta"

* Stop and report timer 1 (started in the setup).
* NOTE(review): any "clear all" executed after the setup resets all timers to
* zero, in which case the time listed here does not cover the whole run.
timer off 1
timer list